# Copyright (c) HySoP 2011-2024
#
# This file is part of HySoP software.
# See "https://particle_methods.gricad-pages.univ-grenoble-alpes.fr/hysop-doc/"
# for further info.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from hysop import vprint, dprint, __KERNEL_DEBUG__, __TRACE_KERNELS__
from hysop.constants import Backend
from hysop.tools.profiler import FProfiler
from hysop.tools.decorators import debug
from hysop.tools.htypes import check_instance, first_not_None, to_list
from hysop.tools.misc import prod
from hysop.tools.units import bytes2str
from hysop.tools.numpywrappers import npw
from hysop.core.arrays.all import Array, HostArray, OpenClArray
from hysop.backend.device.opencl import cl, clArray
from hysop.backend.device.opencl.opencl_kernel_launcher import (
OpenClKernelLauncher,
trace_kernel,
profile_kernel,
)
from hysop.backend.device.opencl.opencl_kernel_statistics import OpenClKernelStatistics
[docs]
class OpenClCopyKernelLauncher(OpenClKernelLauncher):
"""Interface to non-blocking OpenCL copy kernels."""
@debug
def __init__(self, name, dst, src, enqueue_copy_kwds, apply_msg, **kwds):
"""
Initialize an OpenClCopyKernelLauncher.
Parameters
----------
enqueue_copy_kwds: dict
Arguments to to passed to pyopencl.enqueue_copy.
"""
assert "default_global_work_size" not in kwds
assert "default_local_work_size" not in kwds
assert "is_blocking" not in kwds
enqueue_copy_kwds["dest"] = dst
enqueue_copy_kwds["src"] = src
if isinstance(src, np.ndarray) or isinstance(dst, np.ndarray):
enqueue_copy_kwds["is_blocking"] = False
super().__init__(name=name, kernel=None, args_list=(), **kwds)
self._enqueue_copy_kwds = enqueue_copy_kwds
self._apply_msg = apply_msg
def _get_enqueue_copy_kwds(self):
"""
Return a copy of the keywords arguments that will be passed
to pyopencl.enqueue_copy.
"""
return dict(self._enqueue_copy_kwds.items())
def __call__(self, queue=None, wait_for=None, **kwds):
trace_kernel(" " + self._apply_msg)
queue = first_not_None(queue, self._default_queue)
if wait_for is not None:
wait_for = to_list(wait_for)
check_instance(queue, cl.CommandQueue)
evt = cl.enqueue_copy(queue=queue, wait_for=wait_for, **self._enqueue_copy_kwds)
profile_kernel(None, evt, self._apply_msg, fprofiler=self._profiler)
return evt
enqueue_copy_kwds = property(_get_enqueue_copy_kwds)
[docs]
class OpenClCopyBufferLauncher(OpenClCopyKernelLauncher):
"""Non-blocking OpenCL copy kernel between host buffers and/or opencl device buffers."""
def __init__(
self,
varname,
src,
dst,
src_device_offset=None,
dst_device_offset=None,
byte_count=None,
**kwds,
):
"""
Initialize a (HOST <-> DEVICE) or a (DEVICE <-> DEVICE) copy kernel.
Parameters
----------
varname: str
Name of the variable copied for loggin purposes.
src: cl.MemoryObjectHolder or np.ndarray
The source buffer.
dst: cl.MemoryObjectHolder or np.ndarray
The destination buffer.
src_device_offset: int, optional
Offset in the source buffer, only valid if
source buffer is a cl.MemoryObjectHolder.
dst_device_offset: int, optional
Offset in the source buffer, only valid if
source buffer is a cl.MemoryObjectHolder.
byte_count: int
Byte count to copy if and only if source and destination
buffers are cl.MemoryObjectHolders.
Notes
-----
The size of the transfer is controlled by the size of the of the host-side buffer.
If the host-side buffer is a numpy.ndarray, you can control the transfer size
by transfering into a smaller view of the target array by using indexing,
If neither src nor dst are host buffers, the size is controlled by the
parameter byte_count.
Device buffers cannot have views like np.ndarrays, an offset in bytes can
be given as src_device_offset or dst_device_offset instead.
"""
check_instance(src, (cl.MemoryObjectHolder, np.ndarray))
check_instance(dst, (cl.MemoryObjectHolder, np.ndarray))
check_instance(src_device_offset, (int, np.integer), allow_none=True)
check_instance(dst_device_offset, (int, np.integer), allow_none=True)
check_instance(byte_count, (int, np.integer), allow_none=True)
msg = "Host to host copy is not supported."
assert not (isinstance(src, np.ndarray) and isinstance(dst, np.ndarray)), msg
enqueue_copy_kwds = {}
if src_device_offset is not None:
assert isinstance(src, cl.MemoryObjectHolder)
enqueue_copy_kwds["src_offset"] = src_device_offset
if dst_device_offset is not None:
assert isinstance(dst, cl.MemoryObjectHolder)
enqueue_copy_kwds["dst_offset"] = dst_device_offset
if byte_count is not None:
assert isinstance(src, cl.MemoryObjectHolder)
assert isinstance(dst, cl.MemoryObjectHolder)
enqueue_copy_kwds["byte_count"] = byte_count
shape = first_not_None(
(byte_count,),
getattr(src, "shape", None),
getattr(dst, "shape", None),
"...",
)
assert "name" not in kwds
name = "enqueue_copy_{}__{}_to_{}".format(
varname,
"host" if isinstance(src, np.ndarray) else "device",
"host" if isinstance(dst, np.ndarray) else "device",
)
apply_msg = f"{name}<<<{shape}>>>"
super().__init__(
dst=dst,
src=src,
enqueue_copy_kwds=enqueue_copy_kwds,
name=name,
apply_msg=apply_msg,
**kwds,
)
def _format_host_arg(self, arg):
if isinstance(arg, HostArray):
arg = arg.data
nbytes = arg.nbytes
elif isinstance(arg, np.ndarray):
nbytes = arg.size * arg.dtype.itemsize
else:
msg = "Unknown type {} to format device buffer arguments."
msg = msg.format(type(arg))
raise TypeError(msg)
return arg, nbytes
def _format_device_arg(self, arg, arg_offset):
nbytes = None
if isinstance(arg, (OpenClArray, clArray.Array)):
arg_offset = first_not_None(arg_offset, 0)
nbytes = arg.nbytes - arg_offset
arg_offset += arg.offset
arg = arg.base_data
elif isinstance(arg, cl.MemoryObjectHolder):
pass
else:
msg = "Unknown type {} to format device buffer arguments."
msg = msg.format(type(arg))
raise TypeError(msg)
return (arg, arg_offset, nbytes)
[docs]
class OpenClCopyHost2DeviceLauncher(OpenClCopyBufferLauncher):
"""Reduced interface for host to device copy kernels."""
def __init__(self, varname, src, dst, dst_device_offset=None):
src, src_nbytes = self._format_host_arg(src)
dst, dst_device_offset, dst_nbytes = self._format_device_arg(
dst, dst_device_offset
)
check_instance(src, (np.ndarray,))
check_instance(dst, (cl.MemoryObjectHolder,))
check_instance(dst_device_offset, (int, np.integer), allow_none=True)
assert (
(src_nbytes is None) or (dst_nbytes is None) or (src_nbytes == dst_nbytes)
)
super().__init__(
varname=varname, src=src, dst=dst, dst_device_offset=dst_device_offset
)
[docs]
class OpenClCopyDevice2HostLauncher(OpenClCopyBufferLauncher):
"""Reduced interface for device to host copy kernels."""
def __init__(self, varname, src, dst, src_device_offset=None):
src, src_device_offset, src_nbytes = self._format_device_arg(
src, src_device_offset
)
dst, dst_nbytes = self._format_host_arg(dst)
check_instance(src, (cl.MemoryObjectHolder,))
check_instance(dst, (np.ndarray,))
check_instance(src_device_offset, (int, np.integer), allow_none=True)
assert (
(src_nbytes is None) or (dst_nbytes is None) or (src_nbytes == dst_nbytes)
)
super().__init__(
varname=varname, src=src, dst=dst, src_device_offset=src_device_offset
)
[docs]
class OpenClCopyDevice2DeviceLauncher(OpenClCopyBufferLauncher):
"""Reduced interface for device to device copy kernels."""
def __init__(
self,
varname,
src,
dst,
src_device_offset=None,
dst_device_offset=None,
byte_count=None,
):
src, src_device_offset, src_nbytes = self._format_device_arg(
src, src_device_offset
)
dst, dst_device_offset, dst_nbytes = self._format_device_arg(
dst, dst_device_offset
)
byte_count = first_not_None(byte_count, min(src_nbytes, dst_nbytes))
check_instance(src, (cl.MemoryObjectHolder,))
check_instance(dst, (cl.MemoryObjectHolder,))
check_instance(src_device_offset, (int, np.integer), allow_none=True)
check_instance(dst_device_offset, (int, np.integer), allow_none=True)
check_instance(byte_count, (int, np.integer), allow_none=True)
assert (
(src_nbytes is None) or (dst_nbytes is None) or (src_nbytes == dst_nbytes)
)
super().__init__(
varname=varname,
src=src,
dst=dst,
src_device_offset=src_device_offset,
dst_device_offset=dst_device_offset,
byte_count=byte_count,
)
[docs]
class OpenClCopyBufferRectLauncher(OpenClCopyKernelLauncher):
"""
Non-blocking OpenCL copy kernel between host buffers and/or opencl device
rectangle subregions of buffers (OpenCL 1.1 and newer).
Supports n-dimensional strided arrays with dimension greater than 3
via iterating over 3D subregions.
"""
def __init__(
self,
varname,
src,
dst,
copy_region,
copy_src_origin,
copy_dst_origin,
copy_src_pitches,
copy_dst_pitches,
iter_region=None,
iter_src_origin=None,
iter_dst_origin=None,
iter_src_pitches=None,
iter_dst_pitches=None,
**kwds,
):
"""
Initialize a (HOST <-> DEVICE) or a (DEVICE <-> DEVICE) rectangle
subregions copy kernel.
Parameters
----------
varname: str
Name of the variable copied for loggin purposes.
src: cl.MemoryObjectHolder or np.ndarray
The source buffer.
dst: cl.MemoryObjectHolder or np.ndarray
The destination buffer.
copy_region: tuple of ints
The 3D region to copy in terms of bytes for the
first dimension and of elements for the two last dimensions.
copy_src_origin: tuple of ints
The 3D offset in number of elements of the region associated with src buffer.
The final src offset in bytes is computed from src_origin and src_pitch.
copy_dst_origin: tuple of ints
The 3D offset in number of elements of the region associated with dst buffer.
The final dst offset in bytes is computed from dst_origin and dst_pitch.
copy_src_pitches: tuple of ints
The 2D pitches used to compute src offsets in bytes for
the second and the third dimension.
copy_dst_pitches: tuple of ints
The 2D pitches used to compute dst offsets in bytes for
the second and the third dimension.
iter_region: tuple of ints
The n-dimensional region to iterate if the copied region dimension is greater than 3.
iter_src_origin: tuple of ints
The n-dimensional src array origin if the copied region dimension is greater than 3.
iter_dst_origin: tuple of ints
The n-dimensional dst array origin if the copied region dimension is greater than 3.
iter_src_pitches: tuple of ints
The n-dimensional src array pitches if the copied region dimension is greater than 3.
iter_dst_pitches: tuple of ints
The n-dimensional dst array pitches if the copied region dimension is greater than 3.
kwds: dict
Base class arguments
"""
iter_region = first_not_None(iter_region, ())
iter_src_origin = first_not_None(iter_src_origin, ())
iter_dst_origin = first_not_None(iter_dst_origin, ())
iter_src_pitches = first_not_None(iter_src_pitches, ())
iter_dst_pitches = first_not_None(iter_dst_pitches, ())
check_instance(src, (cl.MemoryObjectHolder, np.ndarray))
check_instance(dst, (cl.MemoryObjectHolder, np.ndarray))
check_instance(copy_region, tuple, values=(int, np.integer), size=3)
check_instance(copy_src_origin, tuple, values=(int, np.integer), size=3)
check_instance(copy_dst_origin, tuple, values=(int, np.integer), size=3)
check_instance(copy_src_pitches, tuple, values=(int, np.integer), size=2)
check_instance(copy_dst_pitches, tuple, values=(int, np.integer), size=2)
n = len(iter_region)
check_instance(iter_region, tuple, values=(int, np.integer), size=n)
check_instance(iter_src_origin, tuple, values=(int, np.integer), size=n)
check_instance(iter_dst_origin, tuple, values=(int, np.integer), size=n)
check_instance(iter_src_pitches, tuple, values=(int, np.integer), size=n)
check_instance(iter_dst_pitches, tuple, values=(int, np.integer), size=n)
enqueue_copy_kwds = {}
enqueue_copy_kwds["region"] = copy_region
if isinstance(src, np.ndarray) and isinstance(dst, np.ndarray):
msg = "Host to host copy is not supported."
raise RuntimeError(msg)
elif isinstance(src, cl.MemoryObjectHolder) and isinstance(
dst, cl.MemoryObjectHolder
):
enqueue_copy_kwds["src_origin"] = copy_src_origin
enqueue_copy_kwds["src_pitches"] = copy_src_pitches
enqueue_copy_kwds["dst_origin"] = copy_dst_origin
enqueue_copy_kwds["dst_pitches"] = copy_dst_pitches
src_origin_kwd = "src_origin"
dst_origin_kwd = "dst_origin"
elif isinstance(src, cl.MemoryObjectHolder) and isinstance(dst, np.ndarray):
enqueue_copy_kwds["host_origin"] = copy_dst_origin
enqueue_copy_kwds["host_pitches"] = copy_dst_pitches
enqueue_copy_kwds["buffer_origin"] = copy_src_origin
enqueue_copy_kwds["buffer_pitches"] = copy_src_pitches
src_origin_kwd = "buffer_origin"
dst_origin_kwd = "host_origin"
elif isinstance(src, np.ndarray) and isinstance(dst, cl.MemoryObjectHolder):
enqueue_copy_kwds["host_origin"] = copy_src_origin
enqueue_copy_kwds["host_pitches"] = copy_src_pitches
enqueue_copy_kwds["buffer_origin"] = copy_dst_origin
enqueue_copy_kwds["buffer_pitches"] = copy_dst_pitches
src_origin_kwd = "host_origin"
dst_origin_kwd = "buffer_origin"
else:
msg = "The impossible happened.\n *src={}\n *dst={}"
msg = msg.format(type(src), type(dst))
raise ValueError(msg)
assert "name" not in kwds
name = "enqueue_copy_rect_{}__{}_to_{}".format(
varname,
"host" if isinstance(src, np.ndarray) else "device",
"host" if isinstance(dst, np.ndarray) else "device",
)
apply_msg = "{}<<<{}>>>()"
apply_msg = apply_msg.format(name, copy_region)
# if iteration is required, we redefine __call__
if n > 0:
apply_msg += f" iterated over ndindex {iter_region}"
assert src_origin_kwd in enqueue_copy_kwds
assert dst_origin_kwd in enqueue_copy_kwds
src_origin = enqueue_copy_kwds.pop(src_origin_kwd)
dst_origin = enqueue_copy_kwds.pop(dst_origin_kwd)
super().__init__(
dst=dst,
src=src,
enqueue_copy_kwds=enqueue_copy_kwds,
name=name,
apply_msg=apply_msg,
**kwds,
)
if n > 0:
def call(
queue=None,
wait_for=None,
iter_region=iter_region,
iter_src_origin=iter_src_origin,
iter_dst_origin=iter_dst_origin,
iter_src_pitches=iter_src_pitches,
iter_dst_pitches=iter_dst_pitches,
**kwds,
):
if __KERNEL_DEBUG__ or __TRACE_KERNELS__:
print(" " + self._apply_msg)
queue = first_not_None(queue, self._default_queue)
check_instance(queue, cl.CommandQueue)
for idx in npw.ndindex(*iter_region):
src_byte_offset = npw.dot(
npw.add(iter_src_origin, idx), iter_src_pitches
)
dst_byte_offset = npw.dot(
npw.add(iter_dst_origin, idx), iter_dst_pitches
)
_src_origin = (src_origin[0] + src_byte_offset,) + src_origin[1:]
_dst_origin = (dst_origin[0] + dst_byte_offset,) + dst_origin[1:]
enqueue_copy_kwds[src_origin_kwd] = _src_origin
enqueue_copy_kwds[dst_origin_kwd] = _dst_origin
evt = cl.enqueue_copy(
queue=queue, wait_for=wait_for, **enqueue_copy_kwds
)
profile_kernel(None, evt, self._apply_msg, fprofiler=self._profiler)
wait_for = None
return evt
self.call = call
else:
self.call = None
def __call__(self, *args, **kwds):
if self.call is None:
return super().__call__(*args, **kwds)
else:
return self.call(*args, **kwds)
@classmethod
def _format_slices(cls, a, slices):
check_instance(a, (np.ndarray, clArray.Array, Array))
shape = a.shape
dtype = a.dtype
ndim = a.ndim
if (not slices) or (slices is Ellipsis):
slices = (Ellipsis,)
check_instance(slices, tuple)
# expand ellipsis
if Ellipsis in slices:
nellipsis = slices.count(Ellipsis)
msg = "Only one Ellipsis can be passed."
assert nellipsis == 1, msg
eid = slices.index(Ellipsis)
missing_count = ndim - len(slices)
missing_slices = tuple(
slice(shape[i]) for i in range(eid, eid + missing_count + 1)
)
full_slices = slices[:eid] + missing_slices + slices[eid + 1 :]
slices = full_slices
check_instance(slices, tuple, values=(int, slice), size=ndim)
# compute indices
indices = ()
for slc, si in zip(slices, shape):
if (slc.stop is not None) and (slc.stop > si):
msg = "Error in slice specification: slc={} but size is only {}."
msg = msg.format(slc, si)
raise ValueError(msg)
if isinstance(slc, slice):
indices += (slc.indices(si),)
else:
indices += ((slc, slc + 1, 1),)
nelems = tuple((idx[1] - idx[0] + idx[2] - 1) // idx[2] for idx in indices)
nbytes = prod(nelems) * dtype.itemsize
return slices, dtype, nelems, nbytes, indices
@classmethod
def _compute_region(cls, a, indices):
# compute nelems and parameters
check_instance(indices, tuple, values=tuple, size=a.ndim)
start_offset = 0
if isinstance(a, (np.ndarray,)):
data = a
elif isinstance(a, (HostArray,)):
data = a.handle
else:
try:
data = a.data
except clArray.ArrayHasOffsetError:
data = a.base_data
start_offset = a.offset
if isinstance(a, Array):
a = a.handle
shape = a.shape
strides = a.strides
dtype = a.dtype
estart = tuple(idx[0] for idx in indices)
estop = tuple(idx[1] for idx in indices)
estep = tuple(idx[2] for idx in indices)
assert len(shape) == len(strides) == len(estep) == len(estart) == len(estop)
_estart, _estop, _estep = (
npw.asintegerarray(_) for _ in (estart, estop, estep)
)
if ((_estart % _estep) != 0).any():
msg = "Start is not aligned on step, cannot compute origin."
raise ValueError(msg)
if ((_estop % _estep) != 0).any():
msg = "Stop is not aligned on step, cannot compute region."
raise ValueError(msg)
if estep[-1] != 1:
msg = "Array is not contiguous (last slice step should be 1)."
raise ValueError(msg)
if strides[-1] != dtype.itemsize:
msg = "Array is not contiguous (last strides should be item size)."
raise ValueError(msg)
region, origin, pitches = (), (), ()
for Si, Sr, start, stop, step in zip(shape, strides, estart, estop, estep):
Ni = (stop - start + step - 1) // step
if (Ni <= 0) or (Ni > Si):
msg = f"Ni={Ni}, Si={Si}"
raise ValueError(msg)
elif (not region) or (Ni <= Si):
region += (Ni,)
origin += (start // step,)
pitches += (step * Sr,)
region = np.asarray(region, dtype=np.int32)
origin = np.asarray(origin, dtype=np.int32)
pitches = np.asarray(pitches, dtype=np.int32)
assert pitches[-1] == dtype.itemsize
pitches = pitches[:-1]
region[-1] *= dtype.itemsize
origin[-1] *= dtype.itemsize
origin[-1] += start_offset
return data, region, origin, pitches
[docs]
@classmethod
def from_slices(cls, varname, src, dst, src_slices=None, dst_slices=None):
"""
Build an OpenClCopyBufferRectLauncher from source, destinations
and some slices.
Device arrays must be aligned to CL_DEVICE_MEM_BASE_ADDR_ALIGN.
"""
assert hasattr(src, "shape")
assert hasattr(src, "dtype")
assert hasattr(src, "strides")
assert hasattr(dst, "shape")
assert hasattr(dst, "dtype")
assert hasattr(dst, "strides")
msg0 = "OpenClCopyBufferRectLauncher.from_slices()"
msg0 += "\n *Inputs were:"
msg0 += "\n src: shape={}, dtype={}, slices={}"
msg0 += "\n dst: shape={}, dtype={}, slices={}"
msg0 += "\n *Slices conversions were:"
msg0 += "\n src_slices: {}"
msg0 += "\n dst_slices: {}"
msg0 = msg0.format(
src.shape,
src.dtype,
src_slices,
dst.shape,
dst.dtype,
dst_slices,
"{}",
"{}",
)
src_slices, src_dtype, src_nelems, src_bytes, src_indices = cls._format_slices(
src, src_slices
)
dst_slices, dst_dtype, dst_nelems, dst_bytes, dst_indices = cls._format_slices(
dst, dst_slices
)
msg0 = msg0.format(src_slices, dst_slices)
if src_bytes != dst_bytes:
msg0 += (
"\n >Error: byte size mismatch between source and destination slices:"
)
else:
msg0 += "\n *Data types and byte count:"
msg0 += "\n src: nelems={}, dtype={}, bytes={} ({}B)"
msg0 += "\n dst: nelems={}, dtype={}, bytes={} ({}B)"
msg0 = msg0.format(
src_nelems,
src_dtype,
bytes2str(src_bytes),
src_bytes,
dst_nelems,
dst_dtype,
bytes2str(dst_bytes),
dst_bytes,
)
if src_bytes != dst_bytes:
raise ValueError(msg0)
src_data, src_region, src_origin, src_pitches = cls._compute_region(
src, src_indices
)
dst_data, dst_region, dst_origin, dst_pitches = cls._compute_region(
dst, dst_indices
)
if (src_region != dst_region).any():
msg0 += "\n >Error: mismatch between source and destination regions:"
else:
msg0 += "\n *Determined regions:"
msg0 += "\n src: region={}, origin={}, pitches={}"
msg0 += "\n dst: region={}, origin={}, pitches={}"
msg0 = msg0.format(
src_region, src_origin, src_pitches, dst_region, dst_origin, dst_pitches
)
if (src_region != dst_region).any():
raise ValueError(msg0)
region = src_region
if (region <= 0).any():
msg = "\n >Error: region is ill-formed or zero-sized:"
msg += "\n region: {}"
msg = msg.format(region)
raise ValueError(msg0 + msg)
total_dims = src_region.size
copy_dims = min(total_dims, 3)
iter_dims = total_dims - copy_dims
assert copy_dims > 0
assert iter_dims >= 0
zero, one = np.int32(0), np.int32(1)
copy_region = [one] * 3
copy_src_origin, copy_dst_origin = [zero] * 3, [zero] * 3
copy_src_pitches, copy_dst_pitches = [zero] * 2, [zero] * 2
copy_region[:copy_dims] = region[::-1][:copy_dims]
copy_src_origin[:copy_dims] = src_origin[::-1][:copy_dims]
copy_dst_origin[:copy_dims] = dst_origin[::-1][:copy_dims]
copy_src_pitches[: copy_dims - 1] = src_pitches[::-1][: copy_dims - 1]
copy_dst_pitches[: copy_dims - 1] = dst_pitches[::-1][: copy_dims - 1]
copy_region = tuple(copy_region)
copy_src_origin = tuple(copy_src_origin)
copy_dst_origin = tuple(copy_dst_origin)
copy_src_pitches = tuple(copy_src_pitches)
copy_dst_pitches = tuple(copy_dst_pitches)
iter_region = tuple(region[:iter_dims])
iter_src_origin = tuple(src_origin[:iter_dims])
iter_dst_origin = tuple(dst_origin[:iter_dims])
iter_src_pitches = tuple(src_pitches[:iter_dims])
iter_dst_pitches = tuple(dst_pitches[:iter_dims])
msg0 += "\n *Dimensions:"
msg0 += "\n total: {}"
msg0 += "\n copy: {}"
msg0 += "\n iter: {}"
msg0 = msg0.format(total_dims, copy_dims, iter_dims)
msg0 += "\n *enqueue_copy kernel arguments:"
msg0 += "\n region: {}"
msg0 += "\n src: origin={}, pitches={}"
msg0 += "\n dst: origin={}, pitches={}"
msg0 = msg0.format(
copy_region,
copy_src_origin,
copy_src_pitches,
copy_dst_origin,
copy_dst_pitches,
)
msg0 += "\n *iter arguments:"
msg0 += "\n region: {}"
msg0 += "\n src: origin={}, pitches={}"
msg0 += "\n dst: origin={}, pitches={}"
msg0 = msg0.format(
iter_region,
iter_src_origin,
iter_src_pitches,
iter_dst_origin,
iter_dst_pitches,
)
return cls(
varname=varname,
src=src_data,
dst=dst_data,
copy_region=copy_region,
copy_src_origin=copy_src_origin,
copy_dst_origin=copy_dst_origin,
copy_src_pitches=copy_src_pitches,
copy_dst_pitches=copy_dst_pitches,
iter_region=iter_region,
iter_src_origin=iter_src_origin,
iter_dst_origin=iter_dst_origin,
iter_src_pitches=iter_src_pitches,
iter_dst_pitches=iter_dst_pitches,
)
[docs]
class OpenClFillKernelLauncher(OpenClCopyBufferRectLauncher):
"""Cache buffers to perform a fill operation by using an OpenClCopyBufferRectLauncher."""
__fill_buffers = {}
[docs]
@classmethod
def from_slices(cls, varname, backend, fill_value, dst):
if isinstance(dst, OpenClArray):
assert backend == dst.backend
else:
assert isinstance(dst, clArray.Array)
shape = dst.shape
dtype = dst.dtype
fill_value = dst.dtype.type(fill_value)
src = cls.create_fill_buffer(backend, dtype, shape, fill_value)
obj = super().from_slices(varname=varname, src=src, dst=dst)
return obj
[docs]
@classmethod
def create_fill_buffer(cls, backend, dtype, shape, fill_value):
assert backend.kind == Backend.OPENCL
from hysop.tools.misc import prod
size = prod(shape)
key = (backend, dtype, size, fill_value)
if key in cls.__fill_buffers:
buf = cls.__fill_buffers[key]
else:
buf = backend.full(dtype=dtype, shape=shape, fill_value=fill_value)
cls.__fill_buffers[key] = buf
return buf.reshape(shape)